# Boyoon Lee "The Impact of Educational Content on Anti-Immigrant Attitudes"
# Main Analysis: Figure 1 (Textbook: proportion of words)
# Last updated: 2022-11-16


# Initial settings --------------------------------------------------------
# Set directory
# Replace the placeholder with the local folder that holds the data file
# before running: the relative path passed to load() below is resolved
# against this working directory.
setwd("Your path here")

# Packages
library(tidyverse)
library(tidytext)
# Packages needed for word preparation
#library(stringr)
#library(readtext)
#library(jiebaR)
#library(tmcn)
#library(tm)
#library(quanteda)
#library(stopwords)
#library(xtable)


##########################################
##          Words in Textbooks          ##
##########################################

# Load data ---------------------------------------------------------------
# load() restores the saved objects into the current environment and returns
# their names; fail early if either frequency table used for Figure 1 is
# missing from the file instead of erroring later with "object not found".
loaded_objects <- load("./textbook_word_freq.Rda")
stopifnot(all(c("word_freq_new", "word_freq_old") %in% loaded_objects))


# NOTE: The frequency tables were created with the following (commented-out) code
## Load data ---------------------------------------------------------------
#new<-readLines("New_history_book/New_history_book.txt",encoding="UTF-8")
#old<-readLines("Old_history_book1to3.txt",encoding="UTF-8")

## Erase empty lines
#new = new[! grepl('^\\s*$', new)]
#old = old[! grepl('^\\s*$', old)]


## Word Preparation ---------------------------------------------------------

## Segmentation
#cutter<-worker(bylines = T)
#segmented_tokens_new<-segment(new, cutter)
#segmented_tokens_old<-segment(old, cutter)
#segmented_space_new<-segmented_tokens_new %>% 
#  enframe() %>% 
#  unnest(value) 
#segmented_space_old<-segmented_tokens_old %>% 
#  enframe() %>% 
#  unnest(value) 

## Stopwords
## Quanteda
#ch_stop <- quanteda::stopwords("zh", source = "misc") %>% enframe(name = NULL)
#head(ch_stop)
#nrow(ch_stop)

## tmcn
#data("STOPWORDS")
#tmcn_stop <- stopwordsCN() %>% enframe(name = NULL)
#head(tmcn_stop)
#nrow(tmcn_stop)

## Combine stopwords
#stopwords<-full_join(ch_stop, tmcn_stop, by = "value") %>% 
#  distinct(value)
#stopwords
#stopwords<-stopwords %>%
#  rename(simp = value) %>%
#  mutate(trad = tmcn::toTrad(simp)) # traditional characters
#stopwords<-rbind(stopwords,c("图","圖"),c("K","K"),c("・","・"),c("H","H"),c("R","R")) # Include "Picture" and other symbols as stopwords as well

# Remove stopwords and compute word frequencies
#word_freq_new<-segmented_space_new %>%
#  filter(!value %in% stopwords$trad) %>%
#  filter(value %>% str_detect(pattern = "\\D+")) %>% # remove words consisting of digits
#  count(value) %>%
#  mutate(prop = `n`/sum(`n`)) %>%
#  arrange(desc(n))
#word_freq_old<-segmented_space_old %>%
#  filter(!value %in% stopwords$trad) %>%
#  filter(value %>% str_detect(pattern = "\\D+")) %>% # remove words consisting of digits
#  count(value) %>%
#  mutate(prop = `n`/sum(`n`)) %>%
#  arrange(desc(n))

## Save the frequencies 
#word_freq_new$book<-"New textbook"
#word_freq_old$book<-"Old textbook"
#save(word_freq_new, word_freq_old, file="./textbook_word_freq.Rda")


# Figure 1 ---------------------------------------------------------------

# Combine the new and old frequencies
# Keep the first `top_n_words` rows of each table (the preparation code above
# stores them sorted by descending count) and stack new-textbook rows first.
top_n_words <- 20L

# Indexing past the end of a table would silently create all-NA rows, so make
# sure both tables really contain enough words.
stopifnot(
  nrow(word_freq_new) >= top_n_words,
  nrow(word_freq_old) >= top_n_words
)

word_freq <- rbind(
  word_freq_new[seq_len(top_n_words), ],
  word_freq_old[seq_len(top_n_words), ]
)

# English glosses, attached by position: they must stay in the same order as
# the rows selected above (new textbook first, then old textbook).
eng_new <- c(
  "Taiwan", "Japan", "Tai(First letter of Taiwan)", "Culture", "Society",
  "Aboriginal", "Government", "Han Chinese", "Economy", "Era", "Period",
  "Development", "Discuss", "Study", "Dutch", "China", "Construct",
  "Activity", "(Japanese)Government", "Mainland"
)
eng_old <- c(
  "My country", "Aspect", "Nation(Rep of China)", "Culture", "China",
  "Japan", "Development", "Politics", "Zhong(First letter of China)", "Era",
  "Society", "After", "Taiwan", "Economy", "Chinese Communist Party",
  "Government", "Then", "Mongolia", "Sir(Prefix)", "State"
)
stopifnot(length(eng_new) + length(eng_old) == nrow(word_freq))
word_freq$eng <- c(eng_new, eng_old)

# Bar plot
# One horizontal bar per word showing its proportion, faceted by textbook,
# with the original word (column `value`) printed in the middle of each bar.

# Reversed so the two greys come first; scale_fill_manual() hands the colours
# to the `book` values in order, so extra entries are simply unused.
ink_colors <- rev(c("white", "black", "grey60", "grey75"))

# reorder_within() orders the English labels by proportion separately inside
# each facet; scale_x_reordered() removes the suffix it appends to the labels.
freq <- ggplot(
  word_freq,
  aes(x = reorder_within(eng, prop, book), y = prop, fill = book)
) +
  geom_col(show.legend = FALSE) +
  geom_text(
    aes(label = value),
    color = "black",
    position = position_stack(vjust = 0.5)
  ) +
  # The strips use the default labeller (the `book` values). The previous
  # `labeller = labeller(labels)` referred to no object defined in this
  # script, so it picked up base R's labels() function as an unnamed entry,
  # which labeller() never applies; it has been dropped.
  facet_wrap(~book, scales = "free") +
  coord_flip() +
  scale_x_reordered() +
  scale_fill_manual(values = ink_colors) +
  labs(x = NULL, y = "Proportion of Words") +
  theme_bw() +
  theme(
    strip.text.x = element_text(size = 13),
    strip.background = element_rect(fill = "grey90"),
    axis.text.x = element_text(color = "black", angle = 0, size = 12),
    axis.text.y = element_text(color = "black", size = 13),
    axis.title = element_text(size = 14),
    panel.background = element_rect(fill = "white"),
    plot.background = element_rect(fill = "white"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
freq


